In [ ]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Prerequisites:
1. Familiarity with Python
2. Completed Chapter I: Models by Design
In this lab, you will:
1. Create a model design template
2. Construct the stem and classifier components
3. Construct the learner component
4. Construct blocks with max pooling
5. Construct blocks with feature pooling
6. Compare the training differences between max pooling and feature pooling
Let's create a model template based on the macro-architecture, which includes:
1. stem
2. learner
3. classifier
Fill in the blanks (replace the ??) and make sure the code passes the Python interpreter.
You will need to:
1. Add the activation function to the stem.
2. Pass the group parameters for each group to the group method.
3. Add global average pooling to the classifier.
In [ ]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Conv2D, ReLU, BatchNormalization, GlobalAveragePooling2D, MaxPooling2D
def stem(inputs):
    outputs = Conv2D(32, (3, 3), strides=(1, 1), padding='same')(inputs)
    outputs = BatchNormalization()(outputs)
    # Add the activation function
    # HINT: implement Conv-BN-RE (Conv -> BatchNorm -> ReLU, post-activation batch normalization)
    outputs = ??
    return outputs

def learner(inputs, groups):
    outputs = inputs
    for group_params in groups:
        # Pass the group parameters as Python kwargs
        # HINT: remember the ** (keyword argument) parameter syntax?
        outputs = group(outputs, ??)
    return outputs

def group(inputs, **blocks):
    # Placeholder group; it is completed in the next exercise
    outputs = inputs
    for block in blocks:
        pass
    return outputs

def classifier(inputs, n_classes):
    # Flatten and reduce each feature map to a single value
    # HINT: it is the layer with 'Global' in the name
    outputs = ??
    outputs = Dense(n_classes, activation='softmax')(outputs)
    return outputs

# Create the input tensor
inputs = Input((32, 32, 3))

# Assemble the components of the model
outputs = stem(inputs)
outputs = learner(outputs, [{'blocks': [{'n_filters': 64}]},
                            {'blocks': [{'n_filters': 128}, {'n_filters': 128}]}
                           ])
outputs = classifier(outputs, 10)

# Put the model together
model = Model(inputs, outputs)
The model summary should look like the one below:
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_8 (InputLayer) [(None, 32, 32, 3)] 0
_________________________________________________________________
conv2d_6 (Conv2D) (None, 32, 32, 32) 896
_________________________________________________________________
batch_normalization_6 (Batch (None, 32, 32, 32) 128
_________________________________________________________________
re_lu_5 (ReLU) (None, 32, 32, 32) 0
_________________________________________________________________
global_average_pooling2d (Gl (None, 32) 0
_________________________________________________________________
dense (Dense) (None, 10) 330
=================================================================
Total params: 1,354
Trainable params: 1,290
Non-trainable params: 64
_________________________________________________________________
In [ ]:
model.summary()
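If you get stuck, here is one possible way to fill in the blanks above. This is a sketch, not the only valid answer; it assumes the Conv-BN-ReLU ordering and global average pooling suggested by the hints (only the ?? lines differ from the template):
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Conv2D, ReLU, BatchNormalization, GlobalAveragePooling2D

def stem(inputs):
    outputs = Conv2D(32, (3, 3), strides=(1, 1), padding='same')(inputs)
    outputs = BatchNormalization()(outputs)
    # Post-activation batch normalization: Conv -> BN -> ReLU
    outputs = ReLU()(outputs)
    return outputs

def learner(inputs, groups):
    outputs = inputs
    for group_params in groups:
        # Unpack each group's parameter dictionary as keyword arguments
        outputs = group(outputs, **group_params)
    return outputs

def classifier(inputs, n_classes):
    # Reduce each feature map to a single value, then classify
    outputs = GlobalAveragePooling2D()(inputs)
    outputs = Dense(n_classes, activation='softmax')(outputs)
    return outputs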
Next, we will complete the learner component by:
1. Designing the group method
2. Designing the block method using max pooling
You will need to:
1. Extract the blocks parameter for the group.
2. Extract the number-of-filters parameter for the block.
3. Add max pooling to the block to downsample the feature maps.
In [ ]:
def group(inputs, **blocks):
    outputs = inputs
    # Extract the blocks parameter from the kwargs blocks
    # HINT: the parameter blocks is a dictionary, and 'blocks' is the key
    blocks = ??
    for block_params in blocks:
        outputs = block(outputs, **block_params)
    return outputs

def block(inputs, **block):
    # Extract the number of filters from the kwargs block
    # HINT: the key is n_filters
    n_filters = ??
    outputs = Conv2D(n_filters, (3, 3), strides=(1, 1), padding='same')(inputs)
    outputs = BatchNormalization()(outputs)
    outputs = ReLU()(outputs)
    # Add a max pooling layer to reduce the size of the feature maps by 75%
    # HINT: strides defaults to the pool size (2, 2), but you can specify it anyway
    outputs = MaxPooling2D((2, 2))(outputs)
    return outputs

inputs = Input((32, 32, 3))
outputs = stem(inputs)
outputs = learner(outputs, [{'blocks': [{'n_filters': 64}]},
                            {'blocks': [{'n_filters': 128}, {'n_filters': 128}]}
                           ])
outputs = classifier(outputs, 10)
model_a = Model(inputs, outputs)
The model summary should look like the one below:
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 32, 32, 3)] 0
_________________________________________________________________
conv2d_1 (Conv2D) (None, 32, 32, 32) 896
_________________________________________________________________
batch_normalization_1 (Batch (None, 32, 32, 32) 128
_________________________________________________________________
re_lu_1 (ReLU) (None, 32, 32, 32) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 32, 32, 64) 18496
_________________________________________________________________
batch_normalization_2 (Batch (None, 32, 32, 64) 256
_________________________________________________________________
re_lu_2 (ReLU) (None, 32, 32, 64) 0
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 16, 16, 64) 0
_________________________________________________________________
conv2d_3 (Conv2D) (None, 16, 16, 128) 73856
_________________________________________________________________
batch_normalization_3 (Batch (None, 16, 16, 128) 512
_________________________________________________________________
re_lu_3 (ReLU) (None, 16, 16, 128) 0
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 8, 8, 128) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 8, 8, 128) 147584
_________________________________________________________________
batch_normalization_4 (Batch (None, 8, 8, 128) 512
_________________________________________________________________
re_lu_4 (ReLU) (None, 8, 8, 128) 0
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 4, 4, 128) 0
_________________________________________________________________
global_average_pooling2d_1 ( (None, 128) 0
_________________________________________________________________
dense_1 (Dense) (None, 10) 1290
=================================================================
Total params: 243,530
Trainable params: 242,826
Non-trainable params: 704
In [ ]:
model_a.summary()
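One possible completion of the group and block blanks above, again as a sketch that assumes the dictionary keys named in the hints ('blocks' and 'n_filters'):
def group(inputs, **blocks):
    outputs = inputs
    # The 'blocks' key holds the list of per-block parameter dictionaries
    for block_params in blocks['blocks']:
        outputs = block(outputs, **block_params)
    return outputs

def block(inputs, **block):
    # Number of filters for this block's convolution
    n_filters = block['n_filters']
    outputs = Conv2D(n_filters, (3, 3), strides=(1, 1), padding='same')(inputs)
    outputs = BatchNormalization()(outputs)
    outputs = ReLU()(outputs)
    # 2x2 max pooling halves the height and width of the feature maps
    outputs = MaxPooling2D((2, 2))(outputs)
    return outputs
Next, the max pooling layer is replaced with a feature pooling convolution, i.e., a strided convolution that learns how to downsample the feature maps.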
In [ ]:
def block(inputs, **block):
    n_filters = block['n_filters']
    outputs = Conv2D(n_filters, (3, 3), strides=(1, 1), padding='same')(inputs)
    outputs = BatchNormalization()(outputs)
    outputs = ReLU()(outputs)
    # Add a feature pooling convolution to reduce the size of the feature maps by 75%
    # HINT: the answer is in the strides and padding
    outputs = Conv2D(n_filters, (3, 3), ??)(outputs)
    outputs = BatchNormalization()(outputs)
    outputs = ReLU()(outputs)
    return outputs

inputs = Input((32, 32, 3))
outputs = stem(inputs)
outputs = learner(outputs, [{'blocks': [{'n_filters': 64}]},
                            {'blocks': [{'n_filters': 128}, {'n_filters': 128}]}
                           ])
outputs = classifier(outputs, 10)
model_b = Model(inputs, outputs)
The model summary should look like the one below:
Layer (type) Output Shape Param #
=================================================================
input_25 (InputLayer) [(None, 32, 32, 3)] 0
_________________________________________________________________
conv2d_29 (Conv2D) (None, 32, 32, 32) 896
_________________________________________________________________
batch_normalization_29 (Batc (None, 32, 32, 32) 128
_________________________________________________________________
re_lu_28 (ReLU) (None, 32, 32, 32) 0
_________________________________________________________________
conv2d_30 (Conv2D) (None, 32, 32, 64) 18496
_________________________________________________________________
batch_normalization_30 (Batc (None, 32, 32, 64) 256
_________________________________________________________________
re_lu_29 (ReLU) (None, 32, 32, 64) 0
_________________________________________________________________
conv2d_31 (Conv2D) (None, 16, 16, 64) 36928
_________________________________________________________________
batch_normalization_31 (Batc (None, 16, 16, 64) 256
_________________________________________________________________
re_lu_30 (ReLU) (None, 16, 16, 64) 0
_________________________________________________________________
conv2d_32 (Conv2D) (None, 16, 16, 128) 73856
_________________________________________________________________
batch_normalization_32 (Batc (None, 16, 16, 128) 512
_________________________________________________________________
re_lu_31 (ReLU) (None, 16, 16, 128) 0
_________________________________________________________________
conv2d_33 (Conv2D) (None, 8, 8, 128) 147584
_________________________________________________________________
batch_normalization_33 (Batc (None, 8, 8, 128) 512
_________________________________________________________________
re_lu_32 (ReLU) (None, 8, 8, 128) 0
_________________________________________________________________
conv2d_34 (Conv2D) (None, 8, 8, 128) 147584
_________________________________________________________________
batch_normalization_34 (Batc (None, 8, 8, 128) 512
_________________________________________________________________
re_lu_33 (ReLU) (None, 8, 8, 128) 0
_________________________________________________________________
conv2d_35 (Conv2D) (None, 4, 4, 128) 147584
_________________________________________________________________
batch_normalization_35 (Batc (None, 4, 4, 128) 512
_________________________________________________________________
re_lu_34 (ReLU) (None, 4, 4, 128) 0
_________________________________________________________________
global_average_pooling2d_8 ( (None, 128) 0
_________________________________________________________________
dense_8 (Dense) (None, 10) 1290
=================================================================
Total params: 576,906
Trainable params: 575,562
Non-trainable params: 1,344
In [ ]:
model_b.summary()
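For the feature pooling blank, one way to fill it in, assuming the strides and padding pointed to by the hint:
# A strided convolution halves the height and width like max pooling,
# but with weights that are learned during training
outputs = Conv2D(n_filters, (3, 3), strides=(2, 2), padding='same')(outputs)
With strides=(2, 2) and padding='same', a 32x32 feature map becomes 16x16, the same 75% reduction in size as 2x2 max pooling.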
In [ ]:
from tensorflow.keras.datasets import cifar10
import numpy as np

# Load the CIFAR-10 dataset and normalize the pixel values to [0, 1]
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = (x_train / 255.0).astype(np.float32)
x_test = (x_test / 255.0).astype(np.float32)
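As an optional sanity check, you can print the shapes and dtype after preprocessing; for CIFAR-10 they should be:
print(x_train.shape, y_train.shape)  # (50000, 32, 32, 3) (50000, 1)
print(x_test.shape, y_test.shape)    # (10000, 32, 32, 3) (10000, 1)
print(x_train.dtype)                 # float32, with pixel values scaled to [0, 1]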
In [ ]:
model_a.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_a.fit(x_train, y_train, epochs=3, batch_size=32, validation_split=0.1, verbose=1)
model_a.evaluate(x_test, y_test)
In [ ]:
model_b.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_b.fit(x_train, y_train, epochs=3, batch_size=32, validation_split=0.1, verbose=1)
model_b.evaluate(x_test, y_test)
Since we replaced max pooling with feature pooling, you will see a modest increase in training time, because the feature pooling layers have weights that must also be trained.
If you compare the two models epoch by epoch, you will generally see little difference over the first two epochs; after that, the feature pooling model's accuracy tends to improve faster than the max pooling model's. The delay is due to the fact that it takes some training for the feature pooling layers to learn how to pool the feature maps.
In other words, feature pooling starts from random weights, so it initially performs worse than max pooling (a static algorithm) and then gradually learns.
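To make the difference concrete, you can compare the two downsampling operations in isolation. This is a minimal sketch (the model names are just for illustration): max pooling adds no parameters, while a 3x3 feature pooling convolution over 64 channels adds 36,928 trainable weights.
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D

inputs = Input((32, 32, 64))

# Max pooling: a fixed rule with no trainable weights
max_pool = Model(inputs, MaxPooling2D((2, 2))(inputs))

# Feature pooling: a strided convolution with learned weights
feat_pool = Model(inputs, Conv2D(64, (3, 3), strides=(2, 2), padding='same')(inputs))

print(max_pool.count_params())   # 0
print(feat_pool.count_params())  # 36928 = 64 filters x (3*3*64 weights + 1 bias)
Both operations reduce a (32, 32, 64) input to (16, 16, 64); only the feature pooling version has anything to learn.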